In [34]:
library(dplyr)
library(plotly)
library(flexdashboard)
#devtools::install_github("GIST-ORNL/wbstats")
library(wbstats)
#install.packages("countrycode")
library(countrycode)
library(rvest)
library(RColorBrewer)
library(lubridate)
library(broom)
library(DT)
#install.packages("listviewer")
library(listviewer)
library(tidyr)
#devtools::install_github("EvanOdell/hansard")
library(hansard)
library(xts)
library(zoo)
library(readr)
library(fuzzyjoin)
library(stringi)
library(IRdisplay)
In [2]:
# Download and parse OECD table
oecd_members <- read_html('http://www.oecd.org/about/membersandpartners/list-oecd-member-countries.htm') %>%
                    html_nodes('table') %>%
                    html_table(fill = TRUE, header = TRUE)
# Convert to countrycodes
oecd_codes <- countrycode(oecd_members[[2]]$Country, origin = "country.name", destination = "iso2c")
# Remove non-countries
oecd_codes <- oecd_codes[!is.na(oecd_codes)]
Warning message in countrycode(oecd_members[[2]]$Country, origin = "country.name", :
“Some values were not matched unambiguously: More on membership and enlargement
”
In [3]:
# Get data for women in parliament across the world
women_parliament <- wb(indicator = "SG.GEN.PARL.ZS", country=oecd_codes, startdate = 1997, enddate = 2017, POSIXct = TRUE) %>%
    transmute(date=date_ct, women_parliament=value, iso2c, country)

Countries in the OECD

In [4]:
cbind(country=oecd_members[[2]]$Country, oecd_codes)
Warning message in cbind(country = oecd_members[[2]]$Country, oecd_codes):
“number of rows of result is not a multiple of vector length (arg 2)”
countryoecd_codes
AUSTRALIA AU
AUSTRIA AT
BELGIUM BE
CANADA CA
CHILE CL
CZECH REPUBLIC CZ
DENMARK DK
ESTONIA EE
FINLAND FI
FRANCE FR
GERMANY DE
GREECE GR
HUNGARY HU
ICELAND IS
IRELAND IE
ISRAEL IL
ITALY IT
JAPAN JP
KOREA KR
LATVIA LV
LUXEMBOURG LU
MEXICO MX
NETHERLANDS NL
NEW ZEALAND NZ
NORWAY NO
POLAND PL
PORTUGAL PT
SLOVAK REPUBLIC SK
SLOVENIA SI
SPAIN ES
SWEDEN SE
SWITZERLAND CH
TURKEY TR
UNITED KINGDOM GB
UNITED STATES US
More on membership and enlargementAU
In [5]:
# Divide countries into bins by 2015 GDP/capita
country_bins = data.frame(label=c("Low Income", "Medium Income", "High Income"),
                          color=brewer.pal(3, "Set2"))
gdp_per_capita <- wb(indicator = "NY.GDP.PCAP.CD", country=oecd_codes, startdate = 2015, enddate = 2015) %>%
transmute(country=country,
          gdp_per_capita=value,
          binlabel=cut(log10(value),
                  breaks = 3, labels=country_bins$label),
         bincolor=cut(log10(value),
                  breaks = 3, labels=country_bins$color))
In [43]:
# Get country population
#population <- wb(indicator = "SP.POP.TOTL", country=oecd_codes, startdate=1997, enddate=2017, POSIXct = TRUE) %>%
#    transmute(country, date=date_ct, population=value)
In [6]:
# Alternative population source which is more up to date: http://data.worldbank.org/data-catalog/population-projection-tables
population <- read.csv("worldbank-population.csv") %>%
    select(-Series.Name, -Series.Code) %>%
    gather(date, population, -Country.Name, -Country.Code) %>%
    mutate(date=as.Date(paste0(substr(date, 2, 5) , '-01-01'))) %>%
    filter(Country.Code %in% countrycode(oecd_codes, origin = "iso2c", destination = "iso3c")) %>%
    transmute(country=Country.Name,date,population)
In [84]:
p <-
gdp_per_capita %>% group_by(binlabel) %>% arrange(gdp_per_capita) %>%
    plot_ly(x=~country, y=~gdp_per_capita,
            xaxis=list(categorymode="array"), color=~binlabel,
           type="bar") %>%
    layout(xaxis=list(title="Country", tickangle=-45, showgrid=F),
           yaxis=list(title="GDP per Capita", showgrid=F),
          margin=list(b=100),
           paper_bgcolor="transparent",
            plot_bgcolor="transparent",
            font=list(color="lightgrey"),
          legend = list(x = 0.05, y = 1, bgcolor = "#333333"))

p
In [85]:
df <- women_parliament %>%
    left_join(gdp_per_capita, by=c("country"="country")) %>%
    left_join(population, by=c("country"="country", "date")) %>%
    filter(!is.na(population)) %>%
    group_by(country)
p <- df %>% filter(iso2c != "GB") %>%
    plot_ly(x=~date, y=~women_parliament,
            type = "scatter", mode="lines",
            opacity=0.15, line=list(width=3), color=~binlabel,
            text=~paste0("<b>", country, "</b>","<br>", year(date),": ",
                         round(women_parliament, 0), "%"),
            hoverinfo="text",
            colors=brewer.pal(3, "Set2")
           )
low_income <- df %>% filter(binlabel=="Low Income")
low_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
                               weights = population, data=low_income))

# Calculate LOESS lines: locally weighted best fits across each income group
low_income <- df %>% filter(binlabel=="Low Income")
low_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
                               weights = log10(population), data=low_income))
medium_income <- df %>% filter(binlabel=="Medium Income")
medium_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
                               weights = log10(population), data=medium_income))
high_income <- df %>% filter(binlabel=="High Income")
high_income$loess <- fitted(loess(women_parliament ~ as.numeric(date),
                               weights = log10(population), data=high_income))
# Plot 'em
p <- add_lines(p, opacity=1, data=low_income, x=~date, inherit = F, line=list(width=4, dash="dash", color=brewer.pal(3, "Set2")[1]),
               y=~loess, name="Low Income average",
               text=~paste0("<b>", "Low Income average", "</b>","<br>", year(date), ": ",
                                                  round(loess, 0), "%"), hoverinfo="text")
p <- add_lines(p, opacity=1, data=medium_income, x=~date, line=list(width=4, dash="dash"),
               y=~loess, name="Medium Income average", colors=brewer.pal(3, "Set2"),
               text=~paste0("<b>", "Medium Income average", "</b>","<br>", year(date), ": ",
                                                  round(loess, 0), "%"), hoverinfo="text")
p <- add_lines(p, opacity=1, data=high_income, x=~date, line=list(width=4, dash="dash"),
               y=~loess, name="High Income average", colors=brewer.pal(3, "Set2"),
               text=~paste0("<b>", "High Income average", "</b>","<br>", year(date), ": ",
                                                  round(loess, 0), "%"), hoverinfo="text")
# Add UK line: would be nice to add it as part of the original group but can't specify opacity as an array
p <- df %>% filter(iso2c == "GB") %>%
    add_trace(p, data=., x=~date, inherit = F, line=list(width=4, color="Red"),
              y=~women_parliament, opacity=1.0, name="United Kingdom",
              type="scatter", mode="lines",
              text=~paste0("<b>", country, "</b>","<br>", year(date),": ",
                         round(women_parliament, 0), "%"), hoverinfo="text")

# Format the graph
p <- layout(p, hovermode="closest", xaxis=list(title="Year", showgrid=FALSE),
            yaxis=list(title="% of women in parliament", showgrid=FALSE,
                      range=c(0,50)),
            paper_bgcolor="transparent",
            plot_bgcolor="transparent",
            font=list(color="lightgrey"),
            legend = list(yanchor="center"))

p
Warning message in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y):
“joining factor and character vector, coercing into character vector”
In [9]:
# Data from http://researchbriefings.files.parliament.uk/documents/SN01250/snsg-01250-Tables-for-download-Feb-2017-.xlsx
# Extracting from this excel file via read_excel() is buggy so I just copied and pasted it here.
women_historic <- "
Year	Con	Lab	LD	Other	Total	Pct_MPs
1918	0	0	0	1	1	0.10%
1922	1	0	1	0	2	0.30%
1923	3	3	2	0	8	1.30%
1924	3	1	0	0	4	0.70%
1929	3	9	1	1	14	2.30%
1931	13	0	1	1	15	2.40%
1935	6	1	1	1	9	1.50%
1945	1	21	1	1	24	3.80%
1950	6	14	0	1	21	3.40%
1951	6	11	0	0	17	2.70%
1955	10	14	0	0	24	3.80%
1959	12	13	0	0	25	4.00%
1964	11	18	0	0	29	4.60%
1966	7	19	0	0	26	4.10%
1970	15	10	0	1	26	4.10%
1974	9	13	0	1	23	3.60%
1974	7	18	0	2	27	4.30%
1979	8	11	0	0	19	3.00%
1983	13	10	0	0	23	3.50%
1987	17	21	2	1	41	6.30%
1992	20	37	2	1	60	9.20%
1997	13	101	3	3	120	18.20%
2001	14	95	5	4	118	17.90%
2005	17	98	10	3	128	19.80%
2010	49	81	7	6	143	22.00%
2015	68	99	0	24	191	29.40%
"
women_historic <- read.csv(text=women_historic, sep="\t") %>%
    mutate(Year = as.Date(paste0(Year,"-01-01")), Pct_MPs = as.numeric(gsub("%$", "", Pct_MPs)))
In [12]:
# Get total number of commons MPs over time
# from https://en.wikipedia.org/wiki/Number_of_Westminster_MPs?oldformat=true#Number_of_MPs_by_country
number_of_constituencies <- "
Country	1918	1922	1945	1950	1955	1974	1983	1992	1997	2005	2010	2015
Total	707	615	640	625	630	635	650	651	659	646	650	650"
number_of_constituencies <- read.table(text=number_of_constituencies, header = T) %>%
    gather(year, total) %>% # Turn wide table into long table
    tail(n=-1) %>% # skip first row (old header)
    mutate(year=as.Date(paste0(gsub("^X", "", year), "-01-01")), total=as.integer(total)) # Format year as a date

# Convert to time series to interpolate number of MPs between years
no_con <- xts(number_of_constituencies[,-1], order.by=number_of_constituencies[,1])
no_con <- seq(start(no_con), end(no_con), by="1 year") %>%
    merge(no_con, .) %>%
    na.approx(method="constant") # Use constant interpolation because MPs only change numbers at elections
# Convert back to dataframe
number_of_constituencies <- data.frame(year=index(no_con), total=no_con) %>% transmute(year, total_mps=no_con)
Warning message:
“attributes are not identical across measure variables; they will be dropped”
In [13]:
# Merge dataframes to get total number of MPs for each election year
historic_number_of_women <- left_join(women_historic, number_of_constituencies, by=c("Year"="year")) %>%
    mutate(Pct_Women=round(Total/total_mps*100, digits=2)) # Check if Pct_MPs calculation is done correctly
head(historic_number_of_women)
YearConLabLDOtherTotalPct_MPstotal_mpsPct_Women
1918-01-01 0 0 0 1 1 0.1 707 0.14
1922-01-01 1 0 1 0 2 0.3 615 0.33
1923-01-01 3 3 2 0 8 1.3 615 1.30
1924-01-01 3 1 0 0 4 0.7 615 0.65
1929-01-01 3 9 1 1 14 2.3 615 2.28
1931-01-0113 0 1 1 15 2.4 615 2.44
In [14]:
# Historic election dates from http://researchbriefings.parliament.uk/ResearchBriefing/Summary/SN04512#fullreport
historic_election_dates <- 
"1918-12-14
1922-11-15
1923-12-06
1924-10-29
1929-05-30
1931-10-27
1935-11-14"
historic_election_dates <- as.Date(strsplit(historic_election_dates, "\n")[[1]])
# Modern election dates from hansard
general_elections <- elections("all") %>% filter(electionType=="General Election")
# Concatenate modern election dates from hansard with historic ones and set as Year vector
historic_number_of_women$Year <- c(historic_election_dates, rev(as.Date(general_elections$date._value)))
Connecting to API
Retrieving page 1 of 1
In [83]:
p <-
historic_number_of_women %>%
    select(-Total, -Pct_MPs, -total_mps, -Pct_Women) %>%
    gather(party, number_of_women, -Year) %>%
    plot_ly(x=~Year, y=~number_of_women, color=~party, mode="lines+markers", type="scatter", fill="tozeroy") %>%
    # Format the graph
    layout(hovermode="closest", xaxis=list(title="Year", showgrid=FALSE),
            yaxis=list(title="% of women in parliament", showgrid=FALSE),
            paper_bgcolor="transparent",
            plot_bgcolor="transparent",
            font=list(color="lightgrey"))

p
In [16]:
female_mps <- read_csv("women_parliament.csv", 
    col_types = cols(`Date first elected` = col_date(format = "%d/%m/%Y"), 
        X6 = col_character(), X7 = col_skip(), 
        X8 = col_skip()), skip = 1)
names(female_mps) <- c("name", "date.first.elected", "party", "constituency", "time.in.parliament", "notes")

## Clean up the dataframe
# Remove first line, which is blank
female_mps <- female_mps[-1,]

# If there is an NA in the party column, it is because the MP constituencies overflowed over two rows.
# Let's fix this by tacking on the overflowed constituency to the previous row

# Save the last row's constituency because this is accidentally overwritten by the next line
last_constituency <- tail(female_mps, n=1)$constituency
last_time <- tail(female_mps, n=1)$time.in.parliament
female_mps <-female_mps %>%
    mutate(constituency=ifelse(lead(is.na(party)), # If the next row has a NA in the party column
                              paste0(constituency, lead(ifelse(is.na(constituency),
                                                              "", constituency))), # Take its constituency value and add it to this row's
                              constituency), # Otherwise, do nothing
           time.in.parliament=ifelse(lead(is.na(party)), # Do the same for time.in.parliament
                                     paste0(time.in.parliament, lead(ifelse(is.na(time.in.parliament), # Take value from row below and add it to the end of current row
                                                                           "", paste0(";", time.in.parliament)))),
                                     time.in.parliament)) %>% # Otherwise, do nothing
    mutate(notes=ifelse(lead(is.na(party) & !is.na(name)), # Now check for instances where the notes have overflowed onto the next row
                        paste0(ifelse(is.na(notes), "", paste0(notes, ", ")), lead(name)), # Append notes onto the row
                        notes)) %>%
    filter(!is.na(party)) # Filter out the unneeded rows

# Add the last constituency back in
female_mps$constituency[nrow(female_mps)] <- last_constituency
female_mps$time.in.parliament[nrow(female_mps)] <- last_time

# Make constituency and time.in.parliament into character type string
female_mps <- female_mps %>% mutate(constituency = as.character(constituency),
                                    time.in.parliament = as.character(time.in.parliament))
Warning message:
“Missing column names filled in: 'X1' [1], 'X6' [6], 'X7' [7], 'X8' [8]”
In [68]:
female_mps %>% datatable()
In [19]:
# Separate different tenures as MPs into multiple rows 
new_df <- data.frame()
female_mps$nth.term <- NA
for(i in 1:nrow(female_mps)) {
    # Split time into multiple tenures using ; delimiter
    time_split <- strsplit(as.character(female_mps[i,]$time.in.parliament), ";", fixed=T)[[1]]
    time_split <- time_split[time_split != ""] # Remove empty strings
    constituency_split <- strsplit(as.character(female_mps[i,]$constituency), ";", fixed=T)[[1]]
    constituency_split <- constituency_split[constituency_split != ""]
    # Loop over all times in parliament
    if(length(time_split) > 1) {
        # First correct the row in the original df
        female_mps[i,]$time.in.parliament <- time_split[1]
        # Set the first term number to 1
        female_mps[i,]$nth.term <- 1
        for(i_time in 2:length(time_split)) {
            # Create new row with current row's values
            new_row <- female_mps[i,]
            new_row$time.in.parliament <- time_split[i_time] #  Change new row's time.in.parliament to reflect the loop
            new_row$nth.term <- i_time # Set the term number to the count so that we can track which term this is
            if(length(time_split)==length(constituency_split)) { # If there are an equal number of constituency and time splits
                # then we can assign them 1:1
                # Correct the contituency value
                female_mps[i,]$constituency <- constituency_split[1]
                new_row$constituency <- trimws(strsplit(constituency_split[i_time], "(", fixed=T)[[1]][1]) # Change new row's constituency to reflect the loop
            }
            # Add row to df
            new_df <- rbind(new_df, new_row)
        }
    } else if (length(constituency_split) > 1) {
        # First correct the row in the original df
        female_mps[i,]$constituency <- constituency_split[1]
        female_mps[i,]$nth.term <- 1
        # This is messy but if the MP does not have a term end, we tack on the start of the next term as the end of the previous one
        if (tail(strsplit(time_split[1], "")[[1]], n=1) == "-") { # If first term range ends with a -, then there is no term end date
            female_mps[i,]$time.in.parliament <- gsub("--", "-", paste0(time_split[1],
                                                        "-",
                                                        strsplit(gsub(")", "", strsplit(constituency_split[2],
                                                                               "(",
                                                                               fixed=T)[[1]][2],
                                                             fixed=T), "-", fixed=T)[1]), fixed=T)
        } else {
            female_mps[i,]$time.in.parliament <- gsub(")",
                                                      "",
                                                      strsplit(constituency_split[1],
                                                               "(", fixed=T)[[1]][2],
                                                      fixed=T)
        }
        # For each constituency that the MP has been elected in,
        for(i_constituency in 2:length(constituency_split)) {
            # Create new row with current row's values
            new_row <- female_mps[i,]
            new_row$nth.term <- i_constituency # Set the term number to the count so that we can track which term this is
            # Change new row's constituency to reflect the loop
            new_row$constituency <- trimws(strsplit(constituency_split[i_constituency], "(", fixed=T)[[1]][1])
            # Do the same for the time.in.parliament by taking the time range from within brackets
            new_row$time.in.parliament <- gsub(")", "", strsplit(constituency_split[i_constituency], "(", fixed=T)[[1]][2], fixed=T)
            # Add row to df
            new_df <- rbind(new_df, new_row)
        }
    } else { # For those MPS with only one term served
        female_mps[i,]$nth.term <- 1
    }
}

female_mps <- rbind(female_mps, new_df)
female_mps <- female_mps %>% mutate(time.in.parliament=sub(";", "", time.in.parliament)) # Remove superfluous ";"s
In [20]:
female_mps %>% filter(is.na(time.in.parliament))
namedate.first.electedpartyconstituencytime.in.parliamentnotesnth.term
Mrs Mavis TATE 1931-10-27 Con Willesden, West NA NA 1
Rt Hon Dr Edith SUMMERSKILL, CH 1938-04-06 Lab Fulham NA NA 1
Mrs Jennie ADAMSON 1938-11-07 Lab Dartford NA NA 1
Rt Hon Alice BACON, CBE 1945-07-05 Lab Leeds, North East NA NA 1
Rt Hon Barbara CASTLE 1945-07-05 Lab Blackburn NA NA 1
Mrs Freda CORBET 1945-07-05 Lab Camberwell, North-West NA NA 1
Mrs Jean MANN 1945-07-05 Lab Lanarkshire, Coatbridge NA NA 1
Rt Hon Dame Judith HART, DBE 1959-10-08 Lab Lanark NA NA 1
Rt Hon Shirley WILLIAMS 1964-10-15 Lab Hertfordshire, Hitchin NA NA 1
Dame Janet FOOKES, DBE 1970-06-18 Con Merton & Morden NA (Second Deputy Chairman of Ways and Means) 1
Rt Hon Betty BOOTHROYD 1973-05-24 Lab West Bromwich NA (First woman Speaker) 1
Dame Angela RUMBOLD, DBE 1982-06-03 Con Merton, Mitcham and Morden NA NA 1
Rt Hon Joyce QUIN 1987-06-11 Lab Gateshead, East NA NA 1
Rt Hon Ann WIDDECOMBE 1987-06-11 Con Maidstone NA NA 1
Mrs Angela BROWNING 1992-04-09 Con Tiverton NA NA 1
Rt Hon Jane KENNEDY 1992-04-09 Lab Liverpool, Broad Green NA NA 1
Ms Rachel SQUIRE 1992-04-09 Lab Dunfermline West NA NA 1
Ms Roseanna CUNNINGHAM 1995-06-25 SNP Perth & Kinross NA NA 1
Mrs Rosemary McKENNA, CBE 1997-05-01 Lab Cumbernauld and Kilsyth NA NA 1
Rt Hon Betty BOOTHROYD 1973-05-24 Lab West Bromwich, West NA (First woman Speaker) 2
In [21]:
female_mps <- female_mps %>% rowwise() %>%
    mutate(term.start = trimws(gsub("*", "", strsplit(time.in.parliament, "-")[[1]][1], fixed=T)),
           term.end=strsplit(time.in.parliament, "-")[[1]][2],
           current=substring(time.in.parliament, nchar(time.in.parliament))=="-",
           by.election=grepl("*", time.in.parliament, fixed=T)) # If * in time.in.parliament, then MP was elected at a by-election
In [22]:
# Clean up term.end: need to turn (35, Feb 60, 2005, 10, NA) into (1935, 02/1960, 2005, 2010, [term.start])
female_mps <-
    female_mps %>% rowwise() %>% mutate(term.start = ifelse(nchar(term.start) == 4,
                                                  as.Date(paste0(term.start, "-01-01")),
                                                  as.Date(paste("01", term.start), format="%d %b %Y")),
                         term.end = ifelse(!grepl("-", time.in.parliament), # If no dash is present
                                          term.start, # then MP ended term in the same year
                                          ifelse(nchar(term.end) == 2, # Otherwise, if year is 2 char
                                                ifelse(as.integer(term.end) > 17, # And if term.end > 17
                                                      as.Date(paste0("19", term.end, "-01-01")), # Then the year is in 1900s
                                                      as.Date(paste0("20", term.end, "-01-01"))), # Otherwise, it's in 2000s
                                                ifelse(nchar(term.end) == 4, # If year is instead 4 char 
                                                      as.Date(paste0(term.end, "-01-01")), # Just convert to date
                                                      as.Date(paste("01", term.end), format="%d %b %y"))))) %>%# If it has format Feb 60 -> 01 Feb 60 and convert to Date
    mutate(term.start = as.Date(term.start),
           term.end = as.Date(term.end))

female_mps <- female_mps %>% mutate(term.end=as.Date(term.end))

# Use different name for better matching
female_mps[female_mps$name=="Catherine SMITH",]$name <- "Cat SMITH"
female_mps[female_mps$name=="Caroline DINEAGE",]$name <- "Caroline DINENAGE"
female_mps[female_mps$name=="Miss Margaret JACKSON",]$name <- "Rt Hon Dame Margaret BECKETT"
In [69]:
# Cross reference with list of women MPs on data.parliament.uk
library(jsonlite)
library(httr)

female_mps_from_mnis <- fromJSON(content(GET(url="http://data.parliament.uk/membersdataplatform/services/mnis/members/query/House=Commons|Gender=F|Membership=all", add_headers(`content-type`="application/json")), type="text"))

female_mps_from_mnis <- female_mps_from_mnis$Members[[1]] %>%
    select(DisplayAs, House, HouseStartDate, HouseEndDate) %>%
    rowwise() %>%
    mutate(HouseEndDate=as.Date(ifelse(typeof(HouseEndDate)=="character",
                                       HouseEndDate,
                                       NA),
                                format="%Y-%m-%d"))
No encoding supplied: defaulting to UTF-8.
Warning message:
“JSON string contains (illegal) UTF8 byte-order-mark!”
In [24]:
honorifics <- 'Mr|Mrs|Ms|Miss|Advocate|Ambassador|Baron|Baroness|Brigadier|Canon|Captain|Chancellor|Chief|Col|Comdr|Commodore|Councillor|Count|Countess|Dame|Dr|Duke of|Earl|Earl of|Father|General|Group Captain|H R H the Duchess of|H R H the Duke of|H R H The Princess|HE Mr|HE Senora|HE The French Ambassador M|His Highness|His Hon|His Hon Judge|Hon|Hon Ambassador|Hon Dr|Hon Lady|Hon Mrs|HRH|HRH Sultan Shah|HRH The|HRH The Prince|HRH The Princess|HSH Princess|HSH The Prince|Judge|King|Lady|Lord|Lord and Lady|Lord Justice|Lt Cdr|Lt Col|Madam|Madame|Maj|Maj Gen|Major|Marchesa|Marchese|Marchioness|Marchioness of|Marquess|Marquess of|Marquis|Marquise|Master|Mr and Mrs|Mr and The Hon Mrs|President|Prince|Princess|Princessin|Prof|Prof Emeritus|Prof Dame|Professor|Queen|Rabbi|Representative|Rev Canon|Rev Dr|Rev Mgr|Rev Preb|Reverend|Reverend Father|Right Rev|Rt Hon|Rt Hon Baroness|Rt Hon Lord|Rt Hon Sir|Rt Hon The Earl|Rt Hon Viscount|Senator|Sir|Sister|Sultan|The Baroness|The Countess|The Countess of|The Dowager Marchioness of|The Duchess|The Duchess of|The Duke of|The Earl of|The Hon|The Hon Mr|The Hon Mrs|The Hon Ms|The Hon Sir|The Lady|The Lord|The Marchioness of|The Princess|The Reverend|The Rt Hon|The Rt Hon Lord|The Rt Hon Sir|The Rt Hon The Lord|The Rt Hon the Viscount|The Rt Hon Viscount|The Venerable|The Very Rev Dr|Very Reverend|Viscondessa|Viscount|Viscount and Viscountess|Viscountess|W Baron|W/Cdr'
honorifics <- paste0("(\\b", paste(strsplit(honorifics, "|", fixed=T)[[1]], collapse="\\b|\\b"), "\\b)")
unwanted_chars <- "'|’|-| |\\.|,"
In [25]:
female_mps_from_mnis <- female_mps_from_mnis %>%
    mutate(clean.name = tolower(trimws(stri_trans_general(gsub(unwanted_chars, "", gsub(honorifics, "", DisplayAs)), "Latin-ASCII"))))
In [26]:
female_mps <- female_mps %>%
    mutate(clean.name = tolower(trimws(stri_trans_general(gsub(unwanted_chars, "", gsub(honorifics, "", name)), "Latin-ASCII")))) %>%
    stringdist_left_join(female_mps_from_mnis) %>%
    mutate(term.end=as.Date(ifelse(!is.na(HouseEndDate) & current==T, HouseEndDate, term.end))) %>%
    select(-DisplayAs, -House, -HouseStartDate, -HouseEndDate, -clean.name.y) %>%
    filter(!is.na(time.in.parliament)) # Remove rows without a time in parliament: likely to be spurious at this point.
Joining by: "clean.name"
In [27]:
output <- female_mps  %>%
    mutate(party=ifelse(party=="Lab", "red",
                        ifelse(party=="Con", "blue",
                               ifelse(party %in% c("Lib Dem", "LD"), "orange",
                                     ifelse(party=="SNP", "yellow",
                                           ifelse(party=="Green", "green",
                                                 "black")))))) %>%
    mutate(term.end=as.Date(ifelse(is.na(term.end),
                                   as.Date("2020-01-01"),
                                   term.end))) %>%
    transmute(name, constituency, term_start=term.start, term_end=term.end, party, clean_name=clean.name.x) %>%
    arrange(term_start)

streams <- c()
output$stream <- 0
for(i in 1:nrow(output)) {
    row <- output[i,]
    streams[i] <- 0
    for(stream_id in 1:length(streams)) {
        if(decimal_date(output[i,]$term_start) >= streams[stream_id]) {
            streams[stream_id] <- decimal_date(output[i,]$term_end)
            output[i,]$stream <- stream_id
            break
        }
    }
}

output %>% write.csv("mps_over_time.csv", row.names=F)
In [67]:
p <- output %>% datatable()
saveWidget(p, "final_table.html")
display_html('<iframe src="final_table.html" width="100%" height="800">')